library(reshape2) # melt function
library(ggplot2) # ggplot function
library(pcaPP) # Fast Kendall function
library(energy) # Distance Correlation
library(Hmisc) # Hoeffding's D measure
## Loading required package: lattice
## Loading required package: survival
## Loading required package: Formula
##
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:base':
##
## format.pval, units
library(entropy) # Mutual Information
library(minerva) # Maximum Information Coefficient
library(XICOR) # Chatterjee's Coefficient
library(dHSIC) # Hilbert Schmidt Independence Criterion
library(VineCopula) # Blomqvist's Beta
source("data_generation.R")
my_beta <- function(x, y){
# following Blomqvist, N. (1950). On a measure of dependence between two
# random variables. The Annals of Mathematical Statistics, 21(4), 593-600.
mx <- median(x); my <- median(y)
n1 <- sum(x < mx & y > my) + sum(x > mx & y < my)
n2 <- sum(x < mx & y < my) + sum(x > mx & y > my)
return ((n1 - n2)/(n1 + n2))
}
my_XICOR <- function(x, y){
return(max(XICOR::calculateXI(x,y), XICOR::calculateXI(y,x)))
}
generate_quadratic <- function(n){
t(sapply(1:n, function(i){
x <- stats::runif(1, min = -1, max = 1)
y <- x^2
c(x,y)
}))
}
generate_quadratic2 <- function(n){
t(sapply(1:n, function(i){
x <- stats::runif(1, min = -1, max = 1)
y <- x^2 + stats::rnorm(1, mean = 0, sd = 0.3)
c(x,y)
}))
}
gen_vertical1 <- function(n){
t(sapply(1:n, function(i){
x <- stats::runif(1, min = -2, max = 2)
y <- 0
c(x,y)
}))
}
gen_vertical2 <- function(n){
t(sapply(1:n, function(i){
x <- 0
y <- stats::runif(1, min = -2, max = 2)
c(x,y)
}))
}
generate_power <- function(n){
t(sapply(1:n, function(i){
x <- stats::runif(1, min = 0, max = 2)
y <- x^10 + stats::rnorm(1, mean = 10, sd = 10)
c(x,y)
}))
}
gen_linear <- function(n){
t(sapply(1:n, function(i){
# generate "cluster" assignment
x <- stats::rnorm(1, mean = 2.5, sd = 1)
y <- x + stats::rnorm(1, mean = 0, sd = 1)
c(x,y)
}))
}
gen_strong_linear <- function(n){
t(sapply(1:n, function(i){
# generate "cluster" assignment
x <- stats::rnorm(1, mean = 2.5, sd = 1)
y <- x + stats::rnorm(1, mean = 0, sd = 0.3)
c(x,y)
}))
}
gen_sine <- function(n){
t(sapply(1:n, function(i){
x <- stats::runif(1, min = -2, max = 12)
y <- sin(x) + stats::rnorm(1, mean = 0, sd = 0.1)
c(x,y)
}))
}
quadratic_1 <- generate_quadratic(100)
plot(quadratic_1)

exper_n <- 500
ver1 <- gen_vertical1(exper_n)
plot(ver1)

ver2 <- gen_vertical2(exper_n)
plot(ver2)

cross <- rbind(ver1, ver2)
plot(cross)

power <- generate_power(exper_n)
plot(power)

quadratic2 <- generate_quadratic2(400)
plot(quadratic2)

sine <- gen_sine(exper_n)
plot(sine)

Dcor vs Pearson
plot(quadratic_1)

energy::dcor(quadratic_1[,1],quadratic_1[,2])
## [1] 0.5102642
stats::cor(quadratic_1, method = "pearson")
## [,1] [,2]
## [1,] 1.0000000 -0.2608609
## [2,] -0.2608609 1.0000000
#stats::cor(quadratic_1, method = "spearman")
Things to consider
lollipop <- .generate_lollipop(500)
plot(lollipop)

stats::cor(lollipop)
## [,1] [,2]
## [1,] 1.0000000 0.8452204
## [2,] 0.8452204 1.0000000
energy::dcor(lollipop[,1],lollipop[,2])
## [1] 0.8798029
XI vs Spearman
Linear
lin <- gen_linear(200)
plot(lin)

my_XICOR(lin[,1], lin[,2])
## [1] 0.2872572
stats::cor(lin, method = "spearman")
## [,1] [,2]
## [1,] 1.0000000 0.7313043
## [2,] 0.7313043 1.0000000
Quadratic
my_XICOR(quadratic_1[,1], quadratic_1[,2])
## [1] 0.9411941
stats::cor(quadratic_1, method = "spearman")
## [,1] [,2]
## [1,] 1.0000000 -0.2044164
## [2,] -0.2044164 1.0000000
Sine
my_XICOR(sine[,1], sine[,2])
## [1] 0.8224473
stats::cor(sine, method = "spearman")
## [,1] [,2]
## [1,] 1.00000000 -0.06390755
## [2,] -0.06390755 1.00000000
Hoeffding’s D vs XI
quadratic
Hmisc::hoeffd(sine[,1], sine[,2])$D
## x y
## x 1.00000000 0.05549161
## y 0.05549161 1.00000000
my_XICOR(sine[,1], sine[,2])
## [1] 0.8224473
plot(quadratic_1)

Hmisc::hoeffd(quadratic_1[,1], quadratic_1[,2])$D
## x y
## x 1.0000000 0.2523113
## y 0.2523113 1.0000000
my_XICOR(quadratic_1[,1], quadratic_1[,2])
## [1] 0.9411941
Quadratic with noise
plot(quadratic2)

Hmisc::hoeffd(quadratic2[,1], quadratic2[,2])$D
## x y
## x 1.00000000 0.04664275
## y 0.04664275 1.00000000
my_XICOR(quadratic2[,1], quadratic2[,2])
## [1] 0.3069769
Beta vs Spearman
steep exponential
plot(power)

stats::cor(power, method = "spearman")
## [,1] [,2]
## [1,] 1.0000000 0.7491827
## [2,] 0.7491827 1.0000000
my_beta(power[,1], power[,2])
## [1] -0.592
gen_quadrant <- function(n){
t(sapply(1:n, function(i){
rand_int <- sample(1:4, 1)
if (rand_int == 1){
x <- stats::runif(1, min = 0, max = 3)
y <- stats::runif(1, min = 0, max = 3)
} else if (rand_int == 2){
x <- stats::runif(1, min = 0, max = 4)
y <- stats::runif(1, min = 10, max = 14)
} else if (rand_int == 3){
x <- stats::runif(1, min = 10, max = 14)
y <- stats::runif(1, min = 0, max = 4)
} else {
x <- stats::runif(1, min = 10, max = 14)
y <- stats::runif(1, min = 10, max = 14)
}
c(x,y)
}))
}
quadrant <- gen_quadrant(exper_n)
mean(quadrant[, 1])
## [1] 6.540044
mean(quadrant[, 2])
## [1] 6.86236
plot(quadrant)

Beta vs MI
my_beta(quadrant[, 1], quadrant[, 2])
## [1] -0.104
mi.empirical(entropy::discretize2d(as.matrix(quadrant[, 1]), as.matrix(quadrant[, 2]), numBins1 = 20, numBins2 = 20))
## Warning in KL.plugin(freqs2d, freqs.null, unit = unit): Vanishing value(s) in
## argument freqs2!
## [1] 0.1956087
Beta vs Kendall
plot(quadrant)

my_beta(quadrant[, 1], quadrant[, 2])
## [1] -0.104
stats::cor(quadrant, method = "kendall")
## [,1] [,2]
## [1,] 1.00000000 0.07547896
## [2,] 0.07547896 1.00000000
#minerva::mine(quadrant)$MIC
plot(lin)

my_beta(lin[, 1], lin[, 2])
## [1] -0.56
stats::cor(lin, method = "kendall")
## [,1] [,2]
## [1,] 1.0000000 0.5319598
## [2,] 0.5319598 1.0000000
#minerva::mine(quadrant)$MIC
MIC vs MI
plot(quadratic_1)

minerva::mine(quadratic_1)$MIC
## [,1] [,2]
## [1,] 1 1
## [2,] 1 1
mi.empirical(entropy::discretize2d(as.matrix(quadratic_1[, 1]), as.matrix(quadratic_1[, 2]), numBins1 = 10, numBins2 = 10))
## [1] 1.306421
#minerva::mine(quadrant)$MIC
2. Explore more about the lollipop
get_measures <- function(dat){
pearson_cor <- stats::cor(dat, method = "pearson")[1,2]
spearman_cor <- stats::cor(dat, method = "spearman")[1,2]
kendall_cor <- pcaPP::cor.fk(dat)[1,2]
dist_cor <- energy::dcor(dat[,1], dat[,2])
hoeff_cor <- Hmisc::hoeffd(dat[,1], dat[,2])$D[1,2]
MI_cor <- entropy::mi.empirical(entropy::discretize2d(as.matrix(dat[, 1]),
as.matrix(dat[, 2]),
numBins1 = 10, numBins2 = 10))
MIC_cor <- minerva::mine(dat)$MIC[1,2]
XICOR_cor <- my_XICOR(dat[,1], dat[,2])
HSIC_cor <- dHSIC::dhsic(dat[,1], dat[, 2])$dHSIC
beta_cor <- my_beta(dat[, 1], dat[, 2])
return(c(pearson_cor, spearman_cor, kendall_cor, dist_cor, hoeff_cor,
MI_cor, MIC_cor, XICOR_cor, HSIC_cor, beta_cor))
}
2-1. Original lollipop
original <- .generate_lollipop
set.seed(11)
lollipop1 <- original(500)
plot(lollipop1)

lolli1_measure <- get_measures(lollipop1)
2-2. Three clusters in total
new_lollipop2 <- function(n){
t(sapply(1:n, function(i){
# generate "cluster" assignment
k <- sample(1:3, 1)
# generate "ball"
if(k == 1){
x <- stats::rnorm(1, mean = 0, sd = 1)
y <- stats::rnorm(1, mean = 0, sd = 1)
# generate "bridge"
} else if (k == 2){
x <- stats::rnorm(1, mean = 1, sd = 0.5)
y <- stats::rnorm(1, mean = 2, sd = 0.5)
} else {
# k == 3, generate "stick"
x <- stats::rnorm(1, mean = 3, sd = 0.8)
y <- x + stats::rnorm(1, mean = 1, sd = 0.5)
}
c(x,y)
}))
}
set.seed(10)
new_pop2 <- new_lollipop2(500)
plot(new_pop2)

lolli2_measure <- get_measures(new_pop2)
2-3. Two clusters that continously connect the circle and stick
set.seed(10)
new_lollipop3 <- function(n){
t(sapply(1:n, function(i){
# generate "cluster" assignment
k <- sample(1:2, 1)
# generate "ball"
if(k == 1){
x <- stats::rnorm(1, mean = 0, sd = 1)
y <- stats::rnorm(1, mean = 0, sd = 1)
} else {
# k == 2, generate "stick"
x <- stats::rnorm(1, mean = 1.5, sd = 0.8)
y <- x + stats::rnorm(1, mean = 0.5, sd = 0.5)
}
c(x,y)
}))
}
new_pop3 <- new_lollipop3(500)
plot(new_pop3)

stats::cor(new_pop3, method = "pearson")[1,2]
## [1] 0.6320267
energy::dcor(new_pop3[,1],new_pop3[,2])
## [1] 0.6309555
lolli3_measure <- get_measures(new_pop3)
2-4.The lollipop that has a very long stick
set.seed(15)
new_lollipop4 <- function(n){
t(sapply(1:n, function(i){
# generate "cluster" assignment
k <- sample(1:2, 1)
# generate "ball"
if(k == 1){
x <- stats::rnorm(1, mean = 0, sd = 1)
y <- stats::rnorm(1, mean = 0, sd = 1)
} else {
# k == 2, generate "stick"
x <- stats::rnorm(1, mean = 6, sd = 4)
y <- x + stats::rnorm(1, mean = 1, sd = 0.5)
}
c(x,y)
}))
}
new_pop4 <- new_lollipop4(500)
plot(new_pop4)

lolli4_measure <- get_measures(new_pop4)
2-5. The lollipop that has less longer stick
set.seed(230)
new_lollipop5 <- function(n){
t(sapply(1:n, function(i){
# generate "cluster" assignment
k <- sample(c(1,2), 1)
# generate "ball"
if(k == 1){
x <- stats::rnorm(1, mean = 0, sd = 0.5)
y <- stats::rnorm(1, mean = 1, sd = 0.5)
} else {
# k == 2, generate "stick"
x <- stats::rnorm(1, mean = 1, sd = 1)
y <- x + stats::rnorm(1, mean = 1, sd = 0.3)
}
c(x,y)
}))
}
new_pop5 <- new_lollipop5(500)
plot(new_pop5)

lolli5_measure <- get_measures(new_pop5)
2-6. The lollipop that has less dense stick in the circle
set.seed(230)
new_lollipop6 <- function(n){
t(sapply(1:n, function(i){
# generate "cluster" assignment
k <- sample(c(1,2), 1)
# generate "ball"
if(k == 1){
x <- stats::rnorm(1, mean = 0, sd = 1)
y <- stats::rnorm(1, mean = 0, sd = 1)
} else {
# k == 2, generate "stick"
x <- stats::rnorm(1, mean = 0, sd = 1)
y <- x + stats::rnorm(1, mean = 0, sd = 0.3)
}
c(x,y)
}))
}
new_pop6 <- new_lollipop6(500)
plot(new_pop6)

lolli6_measure <- get_measures(new_pop6)
2-7. The lollipop that has very dense stick in the circle
set.seed(230)
new_lollipop7 <- function(n){
t(sapply(1:n, function(i){
# generate "cluster" assignment
k <- sample(c(1,2), 1)
# generate "ball"
if(k == 1){
x <- stats::rnorm(1, mean = 0, sd = 2)
y <- stats::rnorm(1, mean = 0, sd = 2)
} else {
# k == 2, generate "stick"
x <- stats::rnorm(1, mean = 0, sd = 0.5)
y <- x + stats::rnorm(1, mean = 0, sd = 0.3)
}
c(x,y)
}))
}
new_pop7 <- new_lollipop7(600)
plot(new_pop7)

lolli7_measure <- get_measures(new_pop7)
## Warning in KL.plugin(freqs2d, freqs.null, unit = unit): Vanishing value(s) in
## argument freqs2!
2-8. The lollipop that has multiple clusters in the ball
set.seed(230)
new_lollipop8 <- function(n){
t(sapply(1:n, function(i){
# generate "cluster" assignment
k <- sample(1:5, 1)
# generate "ball"
if(k == 1){
x <- stats::rnorm(1, mean = 0.5, sd = 0.5)
y <- stats::rnorm(1, mean = 0.5, sd = 0.5)
} else if (k == 2) {
x <- stats::rnorm(1, mean = -0.5, sd = 0.5)
y <- stats::rnorm(1, mean = -0.5, sd = 0.5)
} else if (k == 3) {
x <- stats::rnorm(1, mean = 0.5, sd = 0.5)
y <- stats::rnorm(1, mean = -0.5, sd = 0.5)
} else if (k == 4) {
x <- stats::rnorm(1, mean = -0.5, sd = 0.5)
y <- stats::rnorm(1, mean = 0.5, sd = 0.5)
} else {
# Otherwise, generate "stick"
x <- stats::rnorm(1, mean = 1.5, sd = 0.8)
y <- x + stats::rnorm(1, mean = 0.5, sd = 0.5)
}
c(x,y)
}))
}
new_pop8 <- new_lollipop8(700)
plot(new_pop8)

lolli8_measure <- get_measures(new_pop8)
lolli8_measure
## [1] 0.53934296 0.36347619 0.25379113 0.50476354 0.05230268 0.32225962
## [7] 0.34178603 0.21055757 0.01286012 -0.20571429
2-9. The lollipop that has multiple clusters in the ball and more weights on stick than previous one
set.seed(230)
new_lollipop9 <- function(n){
t(sapply(1:n, function(i){
# generate "cluster" assignment
k <- sample(1:6, 1)
# generate "ball"
if(k == 1){
x <- stats::rnorm(1, mean = 0.5, sd = 0.5)
y <- stats::rnorm(1, mean = 0.5, sd = 0.5)
} else if (k == 2) {
x <- stats::rnorm(1, mean = -0.5, sd = 0.5)
y <- stats::rnorm(1, mean = -0.5, sd = 0.5)
} else if (k == 3) {
x <- stats::rnorm(1, mean = 0.5, sd = 0.5)
y <- stats::rnorm(1, mean = -0.5, sd = 0.5)
} else if (k == 4) {
x <- stats::rnorm(1, mean = -0.5, sd = 0.5)
y <- stats::rnorm(1, mean = 0.5, sd = 0.5)
} else {
# Otherwise, generate "stick"
x <- stats::rnorm(1, mean = 1, sd = 1)
y <- x + stats::rnorm(1, mean = 0.5, sd = 0.3)
}
c(x,y)
}))
}
new_pop9 <- new_lollipop9(800)
plot(new_pop9)

lolli9_measure <- get_measures(new_pop9)
combined_measure <- rbind(lolli1_measure, lolli2_measure, lolli3_measure,
lolli4_measure, lolli5_measure, lolli6_measure,
lolli7_measure, lolli8_measure, lolli9_measure)
rownames(combined_measure) <- 1:9
colnames(combined_measure) <- c("pearson", "spearman", "kendall", "dist_cor", "hoeff_D",
"MI", "MIC", "XICOR", "HSIC", "Blomq_beta")
combined_measure
## pearson spearman kendall dist_cor hoeff_D MI MIC
## 1 0.85360836 0.8496716 0.6607615 0.8884833 0.40401811 0.7799179 0.9153044
## 2 0.82197254 0.8256502 0.6452745 0.8336293 0.35008693 0.7097939 0.7308506
## 3 0.63202671 0.6461776 0.4778998 0.6309555 0.18896672 0.4371723 0.4373949
## 4 0.96865997 0.8447599 0.6949579 0.9701998 0.45858001 1.0792695 0.9253863
## 5 0.82823173 0.7293480 0.5559118 0.8087890 0.26510779 0.6273113 0.6310743
## 6 0.44854941 0.4548433 0.3461162 0.4538884 0.09886414 0.3112321 0.3008466
## 7 0.03037599 0.2145965 0.2000779 0.2653970 0.06169904 0.1751000 0.3223880
## 8 0.53934296 0.3634762 0.2537911 0.5047635 0.05230268 0.3222596 0.3417860
## 9 0.65040054 0.5171377 0.3823467 0.6132621 0.12358580 0.3871556 0.4311486
## XICOR HSIC Blomq_beta
## 1 0.6018384 0.09836303 -0.8880000
## 2 0.5233341 0.05951117 -0.5840000
## 3 0.3166453 0.03271560 -0.5280000
## 4 0.6392066 0.09954329 -0.7360000
## 5 0.4473378 0.04862815 -0.5520000
## 6 0.1725967 0.01988712 -0.4320000
## 7 0.1593810 0.01630651 -0.3333333
## 8 0.2105576 0.01286012 -0.2057143
## 9 0.3001927 0.02390519 -0.3200000
melt(combined_measure)
## Var1 Var2 value
## 1 1 pearson 0.85360836
## 2 2 pearson 0.82197254
## 3 3 pearson 0.63202671
## 4 4 pearson 0.96865997
## 5 5 pearson 0.82823173
## 6 6 pearson 0.44854941
## 7 7 pearson 0.03037599
## 8 8 pearson 0.53934296
## 9 9 pearson 0.65040054
## 10 1 spearman 0.84967156
## 11 2 spearman 0.82565025
## 12 3 spearman 0.64617762
## 13 4 spearman 0.84475989
## 14 5 spearman 0.72934804
## 15 6 spearman 0.45484329
## 16 7 spearman 0.21459654
## 17 8 spearman 0.36347619
## 18 9 spearman 0.51713769
## 19 1 kendall 0.66076152
## 20 2 kendall 0.64527455
## 21 3 kendall 0.47789980
## 22 4 kendall 0.69495792
## 23 5 kendall 0.55591182
## 24 6 kendall 0.34611623
## 25 7 kendall 0.20007791
## 26 8 kendall 0.25379113
## 27 9 kendall 0.38234668
## 28 1 dist_cor 0.88848326
## 29 2 dist_cor 0.83362930
## 30 3 dist_cor 0.63095547
## 31 4 dist_cor 0.97019977
## 32 5 dist_cor 0.80878897
## 33 6 dist_cor 0.45388839
## 34 7 dist_cor 0.26539701
## 35 8 dist_cor 0.50476354
## 36 9 dist_cor 0.61326212
## 37 1 hoeff_D 0.40401811
## 38 2 hoeff_D 0.35008693
## 39 3 hoeff_D 0.18896672
## 40 4 hoeff_D 0.45858001
## 41 5 hoeff_D 0.26510779
## 42 6 hoeff_D 0.09886414
## 43 7 hoeff_D 0.06169904
## 44 8 hoeff_D 0.05230268
## 45 9 hoeff_D 0.12358580
## 46 1 MI 0.77991786
## 47 2 MI 0.70979386
## 48 3 MI 0.43717231
## 49 4 MI 1.07926952
## 50 5 MI 0.62731131
## 51 6 MI 0.31123211
## 52 7 MI 0.17510004
## 53 8 MI 0.32225962
## 54 9 MI 0.38715557
## 55 1 MIC 0.91530440
## 56 2 MIC 0.73085065
## 57 3 MIC 0.43739491
## 58 4 MIC 0.92538633
## 59 5 MIC 0.63107431
## 60 6 MIC 0.30084656
## 61 7 MIC 0.32238804
## 62 8 MIC 0.34178603
## 63 9 MIC 0.43114856
## 64 1 XICOR 0.60183841
## 65 2 XICOR 0.52333409
## 66 3 XICOR 0.31664527
## 67 4 XICOR 0.63920656
## 68 5 XICOR 0.44733779
## 69 6 XICOR 0.17259669
## 70 7 XICOR 0.15938100
## 71 8 XICOR 0.21055757
## 72 9 XICOR 0.30019266
## 73 1 HSIC 0.09836303
## 74 2 HSIC 0.05951117
## 75 3 HSIC 0.03271560
## 76 4 HSIC 0.09954329
## 77 5 HSIC 0.04862815
## 78 6 HSIC 0.01988712
## 79 7 HSIC 0.01630651
## 80 8 HSIC 0.01286012
## 81 9 HSIC 0.02390519
## 82 1 Blomq_beta -0.88800000
## 83 2 Blomq_beta -0.58400000
## 84 3 Blomq_beta -0.52800000
## 85 4 Blomq_beta -0.73600000
## 86 5 Blomq_beta -0.55200000
## 87 6 Blomq_beta -0.43200000
## 88 7 Blomq_beta -0.33333333
## 89 8 Blomq_beta -0.20571429
## 90 9 Blomq_beta -0.32000000